import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
from collections import OrderedDict
import os
import sys
import gc
sys.path.append("lib")
sys.path.append("../lib")
from jupytertools import setwd, fix_logging, display
from toolz.functoolz import pipe, partial
from multiprocessing import Pool
import seaborn as sns
from plotnine import ggplot, aes
import plotnine as n
import scipy.stats as stats
import itertools
setwd()
fix_logging(sc.settings)Working directory did not change because calling from nextflow.
%%R
library(conflicted)
conflict_prefer("Position", "base")
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggbeeswarm)
library(edgeR)
options(max.print=100)
options(repr.matrix.max.cols=50, repr.matrix.max.rows=6)R[write to console]: [conflicted] Will prefer base::Position over any other package
R[write to console]: Loading required package: magrittr
R[write to console]: Loading required package: limma
Index(['batch', 'samples', 'patient', 'origin', 'replicate', 'dataset',
'tumor_type', 'platform', 'age', 'sex', 'facs_purity_cd3',
'facs_purity_cd56', 'n_genes', 'mt_frac', 'n_counts', 'rk_n_counts',
'rk_n_genes', 'rk_mt_frac', 'doublet_score', 'is_doublet', 'leiden',
'S_score', 'G2M_score', 'phase', 'cell_type', 'cell_type_unknown',
'cell_type_coarse'],
dtype='object')
Note that scikit-learn's randomized PCA might not be exactly reproducible across different computational platforms. For exact reproducibility, choose `svd_solver='arpack'.` This will likely become the Scanpy default in the future.
computing PCA with n_comps = 50
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
# Overview UMAP of the full dataset: sample/batch composition and cell-type annotation.
sc.pl.umap(adata, color=["samples", "cell_type"])
# Marker genes for the subsets of interest (CD4/CD8 T cells, FOXP3 Tregs, PD-1, NK).
sc.pl.umap(adata, color=["CD4", "CD8A", "FOXP3", "PDCD1", "KLRF1"])(using combat with covariates did not work out -> singular matrix, i.e. too few samples per group)
# Select the 3000 most highly variable genes (cell-ranger flavor) for downstream
# PCA/neighbors; flags and dispersion stats are written to adata.var.
sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000)
# Diagnostic plot: normalized dispersion vs. mean for selected/unselected genes.
sc.pl.highly_variable_genes(adata)extracting highly variable genes
finished
added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
computing PCA with n_comps = 50
computing PCA on highly variable genes
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
# Re-plot the UMAP after restricting to highly variable genes.
sc.pl.umap(adata, color=["samples", "cell_type"])
# Marker genes again, then (fused next cell) re-run hvg selection per cell-type
# subset in `adatas` with 2000 top genes each.
# NOTE(review): loop-body indentation was lost in the notebook export below.
sc.pl.umap(adata, color=["CD4", "CD8A", "FOXP3", "PDCD1", "KLRF1"])for ct, tmp_adata in adatas.items():
print("###########################\n{}\n###########################\n\n".format(ct))
sc.pp.highly_variable_genes(tmp_adata, flavor="cell_ranger", n_top_genes=2000)
sc.pl.highly_variable_genes(tmp_adata)###########################
CD4
###########################
extracting highly variable genes
finished
added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
###########################
CD8
###########################
extracting highly variable genes
finished
added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
###########################
NK
###########################
extracting highly variable genes
/home/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py:135: RuntimeWarning: divide by zero encountered in true_divide
) / disp_mad_bin[df['mean_bin'].values].values
/home/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py:135: RuntimeWarning: invalid value encountered in true_divide
) / disp_mad_bin[df['mean_bin'].values].values
finished
added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
# Per-subset dimensionality reduction: for each cell type in `adatas`
# (CD4, CD8, NK) run PCA (arpack solver for exact reproducibility, see the
# scanpy note above), build a kNN graph (k=10) and compute a UMAP.
# NOTE(review): loop-body indentation was lost in the notebook export below.
for ct, tmp_adata in adatas.items():
print("###########################\n{}\n###########################\n\n".format(ct))
sc.pp.pca(tmp_adata, svd_solver="arpack")
sc.pp.neighbors(tmp_adata, n_neighbors=10)
sc.tl.umap(tmp_adata)
sc.pl.umap(tmp_adata, color=["samples", "cell_type"])###########################
CD4
###########################
computing PCA with n_comps = 50
computing PCA on highly variable genes
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
###########################
CD8
###########################
computing PCA with n_comps = 50
computing PCA on highly variable genes
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
###########################
NK
###########################
computing PCA with n_comps = 50
computing PCA on highly variable genes
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
def test_leiden_thresholds(adata_key, resolutions, seeds, n_cpus):
    """
    Run leiden clustering over a grid of resolutions and random seeds
    and report the mean number of clusters per resolution.

    Args:
        adata_key: key identifying the AnnData object to cluster
            (forwarded to `leiden_with_r`).
        resolutions: numpy array containing all resolutions to test.
        seeds: numpy array containing random seeds (every resolution is
            tested with every seed).
        n_cpus: number of worker processes used to parallelize the runs.

    Returns:
        numpy array with one entry per resolution: the number of leiden
        clusters, averaged over all seeds.
    """
    args = list(itertools.product([adata_key], resolutions, seeds))
    # Fan the (key, resolution, seed) combinations out over a worker pool.
    # Previously this referenced a global `p` and silently ignored `n_cpus`;
    # the pool is now created (and cleaned up) here.
    with Pool(n_cpus) as p:
        leiden_results = p.starmap(leiden_with_r, args)
    # `leiden_with_r` returns a categorical cluster assignment; its number of
    # categories is the number of clusters found.
    n_clusters = [res.cat.categories.size for res in leiden_results]
    # Re-arrange results into a resolutions x seeds table and average over seeds.
    clusters = {s: dict() for s in seeds}
    for (_, r, s), n in zip(args, n_clusters):
        clusters[s][r] = n
    clusters_mean = np.mean(pd.DataFrame.from_dict(clusters).values, axis=1)
    return clusters_mean


resolutions = np.arange(0.1, 1.5, 0.05)
# Random seeds used to assess clustering stability at each resolution.
seeds = np.arange(0, 10)
# For every subset, plot the mean number of clusters vs. leiden resolution
# (16 worker processes) and mark the chosen resolution from `leiden_thres`
# (defined elsewhere in this notebook) with a grey vertical line.
# NOTE(review): loop-body indentation was lost in the notebook export below.
for ct, tmp_adata in adatas.items():
print("###########################\n{}\n###########################\n\n".format(ct))
clusters_mean = test_leiden_thresholds(ct, resolutions, seeds, 16)
plt.plot(resolutions, clusters_mean)
# plt.plot(resolutions, pd.Series(clusters_mean).rolling(3), color="red")
plt.xlabel("leiden resolution")
plt.ylabel("#clusters")
plt.vlines(x=leiden_thres[ct], ymin=0, ymax=plt.ylim()[1], color="grey")
plt.show()###########################
CD4
###########################
###########################
CD8
###########################
###########################
NK
###########################
There is no clear plateau... anyway, 10 clusters sounds reasonable.
# Final clustering: run leiden at the chosen per-subset resolution, show the
# clusters on the UMAP, then rank marker genes per cluster.
# NOTE(review): loop-body indentation was lost in the notebook export below.
for ct, tmp_adata in adatas.items():
print("###########################\n{}\n###########################\n\n".format(ct))
sc.tl.leiden(tmp_adata, resolution=leiden_thres[ct])
sc.pl.umap(tmp_adata, color="leiden", legend_loc="on data")
sc.tl.rank_genes_groups(tmp_adata, groupby="leiden")
sc.pl.rank_genes_groups(tmp_adata)###########################
CD4
###########################
running Leiden clustering
finished
ranking genes
finished
###########################
CD8
###########################
running Leiden clustering
finished
ranking genes
finished
###########################
NK
###########################
running Leiden clustering
finished
ranking genes
finished
... storing 'cluster' as categorical